Introduction

This was an early look at the Hard to Count scores from the census planning database, and an experiment in using tmap.

Note you will need to set the global settings to use the project directory when knitting, as documented in figure 16.1: https://bookdown.org/yihui/rmarkdown-cookbook/working-directory.html.

Initialize with libraries and functions

Note, geneorama is needed, and can be installed with devtools::install_github("geneorama/geneorama"). It’s only needed for a handful of functions, but there are also some included maps, like community areas.

##==============================================================================
## INITIALIZE
##==============================================================================

# rm(list=ls())
library("geneorama")
library("data.table")
library("rgeos")
library("leaflet")
library("colorspace")
library("sp")
library("rgdal")
library("RColorBrewer")
library("magrittr")

## Not used, this time:
# library("spdep")
# library("ggplot2")
# library("bit64")

## Load the functions in the functions directory with sourceDir
sourceDir("functions/")

Import Community Areas and Census Tracts

The CA data is from the geneorama package. The census tracts are from another repository since they were not released by the census at the time this document was created.

##==============================================================================
## IMPORT DATA
##==============================================================================

data("chi_community_areas")

## 2020 Tract data
shp_tracts_2020 <- readRDS("data_maps_census_2020/tracts_2020_stuartlynn_Cook.Rds")
## Subset Tract data to Chicago
shp_tracts_2020 <- shp_tracts_2020[!is.na(shp_tracts_2020$community), ]
shp_tracts_2020$GEOID <- substr(shp_tracts_2020$GEO_ID, 10, 20)

Crosswalks

Cross walks are needed to translate 2020 response rates to 2019 tracts. Only a few tracts changed, but there was no way to know how many or if the effect was concentrated ahead of time.

Also tracts need to be allocated to ward. Ward boundaries cut tracts in half often.

To do the allocation we used a private data set from Replica HQ, which has simulated population data at an individual level. The person level data is aggregated to household based on location, and those households are geocoded to geographies to come up with the translations between geographies.

##------------------------------------------------------------------------------
## Tract crosswalk based on replica data
##------------------------------------------------------------------------------

to2020 <- fread("data_census_planning/crosswalk_from_2020.csv")
to2020[ , TRACT_2020 := as.character(TRACT_2020)]
to2020[ , TRACT_prev := as.character(TRACT_prev)]
to2020[ , TRACT_2020 := substr(TRACT_2020,6,11)]
to2020[ , TRACT_prev := substr(TRACT_prev,6,11)]

to2020 <- to2020[ , list(TRACT_2020=TRACT_2020[which.max(allocation)]), TRACT_prev]

##------------------------------------------------------------------------------
## Ward crosswalk based on replica data
##------------------------------------------------------------------------------
toWard <- fread("data_census_planning/crosswalk_replica_based.csv")
toWard[ , TRACT:=substr(TRACT,6,11)]
toWard <- toWard[!is.na(TRACT)]
# toWard <- dcast(toWard, TRACT~ward, value.var = "allocation", fill = 0)

Response rate data

Load the locally captured response rate data. This is collected from the Census API on a daily basis using the script in this package, and a cron job.

##------------------------------------------------------------------------------
## Locally collected responses for 2020
##------------------------------------------------------------------------------
resp_filename <- max(list.files(path = "data_daily_resp_cook",
                                pattern = "^cook.+csv$", full.names = T))
resp_current <- fread(resp_filename)
resp_current[ , tract := NULL]
resp_current[ , GEOID := substr(GEO_ID, 10, 20)]
resp_current[ , TRACT := substr(GEO_ID, 15, 20)]
resp_current <- resp_current[match(shp_tracts_2020$TRACT, TRACT)]

## File name for current report:
resp_filename
## [1] "data_daily_resp_cook/cook 2020-09-07.csv"
## Head of current response rate data
head(resp_current)
##                  GEO_ID  RESP_DATE DRRALL CRRALL DRRINT CRRINT state county
## 1: 1400000US17031380200 2020-09-04    0.2   65.4    0.1   26.3    17     31
## 2: 1400000US17031611500 2020-09-04    0.1   42.1    0.1   24.2    17     31
## 3: 1400000US17031711100 2020-09-04    0.0   63.6    0.0   35.0    17     31
## 4: 1400000US17031420300 2020-09-04    0.0   54.4    0.0   48.6    17     31
## 5: 1400000US17031283200 2020-09-04    0.0   68.3    0.0   62.1    17     31
## 6: 1400000US17031590600 2020-09-04    0.1   52.4    0.1   45.3    17     31
##          GEOID  TRACT
## 1: 17031380200 380200
## 2: 17031611500 611500
## 3: 17031711100 711100
## 4: 17031420300 420300
## 5: 17031283200 283200
## 6: 17031590600 590600

Planning database

The planning database contains many tract level variables on populations across ACS and Census surveys. It was available among the census 2020 planning materials.

##------------------------------------------------------------------------------
## pdb file from CB
## Use cross walk to link to 2020 tracts
##------------------------------------------------------------------------------
# file.copy("../data-census-planning/pdb2019bgv6_us.csv",
#           "data_census_planning/")
# str(pdb)

## NOTE: MANUALLY LIMITED TO COOK COUNTY FOR GITHUB SIZE LIMIT

pdb <- fread("data_census_planning/pdb2019bgv6_cook.csv", keepLeadingZeros = T)
pdb <- pdb[State=="17" & County == "031"]

# table(unique(pdb$Tract) %in% to2020$TRACT_prev)
# table(unique(pdb$Tract) %in% to2020$TRACT_2020)
pdb[ , TRACT_2020 := to2020[match(pdb$Tract, TRACT_prev), TRACT_2020]]

Testing overlap

Test overlap and coverage between shapefile and planning database:

table(shp_tracts_2020$TRACT %in% pdb$TRACT_2020)
## 
## FALSE  TRUE 
##    12   779
table(pdb$TRACT_2020 %in% shp_tracts_2020$TRACT)
## 
## FALSE  TRUE 
##  1847  2146
table(unique(pdb$TRACT_2020) %in% shp_tracts_2020$TRACT)
## 
## FALSE  TRUE 
##     1   779

Although the overlap isn’t perfect, other analysis indicates that the errors are not significant. Most of tracts with error are unpopulated, or very sparsely populated; for example airports.

Subset Planning DB to maptch tract shapefile

pdb <- pdb[pdb$TRACT_2020 %in% shp_tracts_2020$TRACT]
dim(pdb)
## [1] 2146  347

More checks: Population and household totals:

## Estimates of total population
sum(pdb$Tot_Population_CEN_2010)     ## 2,695,249
## [1] 2644979
sum(pdb$Tot_Population_ACS_13_17)    ## 2,722,098
## [1] 2669418
sum(pdb$Tot_Population_ACSMOE_13_17) ##   705,132 (?)
## [1] 693327
## Estimates for total households
sum(pdb$Tot_Housing_Units_CEN_2010)     ## 1,194,116
## [1] 1164503
sum(pdb$Tot_Housing_Units_ACS_13_17)    ## 1,200,059
## [1] 1168850
sum(pdb$Tot_Housing_Units_ACSMOE_13_17) ##   203,658 (?)
## [1] 198935

Geocode to Community Area

Geocode tracts to Community Area for the shapefile, then match the results to the other planning and response rate files.

##------------------------------------------------------------------------------
## Get community area
##------------------------------------------------------------------------------

## Geocode tracts to Community Area
shp_tracts_2020$community <- geocode_to_map(shp_tracts_2020$lat_centroid,
                                            shp_tracts_2020$lon_centroid,
                                            map = chi_community_areas,
                                            map_field_name = "community")
## Match to planning database
pdb$community <- shp_tracts_2020$community[match(pdb$TRACT_2020, shp_tracts_2020$TRACT)]

## Match to current response
resp_current$community <- shp_tracts_2020$community[match(resp_current$TRACT, 
                                                          shp_tracts_2020$TRACT)]

Aggreagate Planning DB to Household

The census is conducted at a household level, so aggregage key statistics of the planning db to a household level.

Once this is done, join in the response rate data so that everything is in one place.

##------------------------------------------------------------------------------
## Aggregate planning database statistics to 2020 tract
##------------------------------------------------------------------------------

pdb_household <- pdb[i = TRUE,
                     j = list(households = sum(Tot_Housing_Units_ACS_13_17),
                              Tot_Population_ACS_13_17 = sum(Tot_Population_ACS_13_17),
                              Tot_Occp_Units_ACS_13_17 = sum(Tot_Occp_Units_ACS_13_17),
                              Hispanic_ACS_13_17 = sum(Hispanic_ACS_13_17),
                              pct_hisp = sum(Hispanic_ACS_13_17) / sum(Tot_Population_ACS_13_17)),
                     keyby = list(TRACT = TRACT_2020)]
pdb_household
##       TRACT households Tot_Population_ACS_13_17 Tot_Occp_Units_ACS_13_17
##   1: 010100       2626                     4444                     2248
##   2: 010201       2994                     7197                     2670
##   3: 010202       1216                     2487                      976
##   4: 010300       3395                     6413                     2982
##   5: 010400       2218                     5411                     1870
##  ---                                                                    
## 775: 843700       1004                     2549                      963
## 776: 843800        847                     1699                      630
## 777: 843900       2267                     3332                     1842
## 778: 980000          0                        0                        0
## 779: 980100          0                        0                        0
##      Hispanic_ACS_13_17   pct_hisp
##   1:                410 0.09225923
##   2:               1627 0.22606642
##   3:                717 0.28829916
##   4:               1204 0.18774365
##   5:                390 0.07207540
##  ---                              
## 775:                680 0.26677128
## 776:                409 0.24072984
## 777:                108 0.03241297
## 778:                  0        NaN
## 779:                  0        NaN
## Join in response rates and community area names
ii <- match(pdb_household$TRACT, resp_current$TRACT)
pdb_household$response <- resp_current[ii, CRRALL]
pdb_household$community <- resp_current[ii, community]

Basic metrics

One of the big questions was how language barriers would affect outreach, so we looked at how many households are Hispanic. Other languages were also examined, but this was a good starting point.

hist(pdb_household$pct_hisp)

What is the distribution of the current response rate?

hist(pdb_household$response)

Summary by community area

##------------------------------------------------------------------------------
## Aggregate response data to community area
##------------------------------------------------------------------------------
summary_community <- pdb_household[
  i = !is.na(response),
  j = list(resp = round(sum(response * households)/sum(households)/100, 2),
           pop = sum(Tot_Population_ACS_13_17),
           occ_households = sum(Tot_Occp_Units_ACS_13_17),
           tot_households = sum(households),
           hisp_pct = round(sum(pct_hisp * households) / sum(households), 2)),
  keyby = community]

setnames(summary_community, gsub("_"," ",colnames(summary_community)))
setnames(summary_community, capwords(colnames(summary_community)))
# clipper(summary_community)  ## COPY TO CLIPBOARD FOR EXCEL
summary_community
##                  Community Resp   Pop Occ Households Tot Households Hisp Pct
##  1:                   <NA> 0.63 69100          40300          45426     0.07
##  2:            ALBANY PARK 0.61 51992          16563          18240     0.48
##  3:         ARCHER HEIGHTS 0.57 13169           3900           4291     0.78
##  4:          ARMOUR SQUARE 0.59 13396           5192           5708     0.04
##  5:                ASHBURN 0.75 43792          12982          13612     0.37
##  6:         AUBURN GRESHAM 0.57 46278          17111          20614     0.02
##  7:                 AUSTIN 0.53 95260          31766          37248     0.13
##  8:            AVALON PARK 0.66 10034           3898           4574     0.01
##  9:               AVONDALE 0.55 37368          13335          15109     0.57
## 10:         BELMONT CRAGIN 0.53 79956          22131          24407     0.80
## 11:                BEVERLY 0.77 20822           7673           8048     0.05
## 12:             BRIDGEPORT 0.57 33696          12751          13845     0.24
## 13:          BRIGHTON PARK 0.47 44813          12314          13941     0.83
## 14:               BURNSIDE 0.56  2254            965           1315     0.01
## 15:        CALUMET HEIGHTS 0.69 13188           5271           6044     0.03
## 16:                CHATHAM 0.54 31071          13650          16725     0.01
## 17:           CHICAGO LAWN 0.50 53098          16001          18997     0.46
## 18:               CLEARING 0.68 25891           8760           9242     0.52
## 19:                DOUGLAS 0.57 20781           9590          10786     0.03
## 20:                DUNNING 0.72 43689          15616          16624     0.27
## 21:     EAST GARFIELD PARK 0.46 20004           6819           8336     0.04
## 22:              EAST SIDE 0.61 23771           6843           7817     0.81
## 23:              EDGEWATER 0.68 49478          25088          28241     0.17
## 24:            EDISON PARK 0.76 11753           4638           4968     0.10
## 25:              ENGLEWOOD 0.37 25075           9164          14227     0.03
## 26:            FOREST GLEN 0.81 18997           6788           7223     0.14
## 27:            FULLER PARK 0.43  2354           1045           1484     0.06
## 28:              GAGE PARK 0.51 37943           8819           9762     0.93
## 29:         GARFIELD RIDGE 0.74 36452          12179          12847     0.48
## 30:        GRAND BOULEVARD 0.52 17100           7911           9546     0.02
## 31: GREATER GRAND CROSSING 0.46 31766          12129          15638     0.01
## 32:              HEGEWISCH 0.62  9384           3499           3943     0.57
## 33:                HERMOSA 0.51 24144           7022           7792     0.85
## 34:          HUMBOLDT PARK 0.45 56427          16813          19627     0.55
## 35:              HYDE PARK 0.59 23749          10909          12448     0.09
## 36:            IRVING PARK 0.63 54606          20236          22526     0.42
## 37:         JEFFERSON PARK 0.72 26808          10359          11174     0.22
## 38:                KENWOOD 0.55 11812           5894           6691     0.02
## 39:              LAKE VIEW 0.68 81384          40927          44727     0.08
## 40:           LINCOLN PARK 0.65 62961          29483          32357     0.06
## 41:         LINCOLN SQUARE 0.70 41715          18296          19703     0.16
## 42:           LOGAN SQUARE 0.61 73927          29573          32815     0.41
## 43:                   LOOP 0.56 18469           8553          10218     0.08
## 44:        LOWER WEST SIDE 0.44 32888          11947          13716     0.75
## 45:          MCKINLEY PARK 0.55 15767           5140           5554     0.60
## 46:              MONTCLARE 0.62 13784           4512           5040     0.58
## 47:            MORGAN PARK 0.70 22375           8070           8980     0.04
## 48:        MOUNT GREENWOOD 0.75 19277           6595           7032     0.09
## 49:        NEAR NORTH SIDE 0.57 78237          48148          56904     0.06
## 50:        NEAR SOUTH SIDE 0.74  4605           2424           2535     0.02
## 51:         NEAR WEST SIDE 0.60 62872          28162          31242     0.09
## 52:               NEW CITY 0.39 36421          11098          14165     0.58
## 53:           NORTH CENTER 0.69 35789          14488          16227     0.11
## 54:         NORTH LAWNDALE 0.42 34069          10653          13795     0.08
## 55:             NORTH PARK 0.68 18842           6514           7035     0.17
## 56:           NORWOOD PARK 0.78 36936          14750          15757     0.12
## 57:                OAKLAND 0.62  3337           1322           1462     0.03
## 58:                  OHARE 0.63 12139           6015           6536     0.07
## 59:           PORTAGE PARK 0.64 64313          22693          25024     0.40
## 60:                PULLMAN 0.59  6613           3083           3480     0.06
## 61:              RIVERDALE 0.53  7394           2428           3304     0.04
## 62:            ROGERS PARK 0.63 49651          22218          25355     0.22
## 63:               ROSELAND 0.56 39164          13102          16412     0.01
## 64:          SOUTH CHICAGO 0.45 24444           9168          12332     0.18
## 65:          SOUTH DEERING 0.61 14614           5009           5841     0.31
## 66:         SOUTH LAWNDALE 0.40 74851          17824          20965     0.91
## 67:            SOUTH SHORE 0.47 45535          19599          25798     0.01
## 68:                 UPTOWN 0.67 57973          29341          32599     0.14
## 69:     WASHINGTON HEIGHTS 0.66 27453           9570          10470     0.01
## 70:        WASHINGTON PARK 0.43 11502           4334           5676     0.02
## 71:            WEST ELSDON 0.66 19210           5137           5463     0.82
## 72:         WEST ENGLEWOOD 0.42 29929           9642          12889     0.06
## 73:     WEST GARFIELD PARK 0.41 17155           5335           7472     0.02
## 74:              WEST LAWN 0.64 33108           8988           9807     0.80
## 75:           WEST PULLMAN 0.57 27733           9133          11216     0.06
## 76:             WEST RIDGE 0.64 76215          25194          28185     0.19
## 77:              WEST TOWN 0.59 83621          37127          40475     0.24
## 78:               WOODLAWN 0.46 21875           8639          11201     0.03
##                  Community Resp   Pop Occ Households Tot Households Hisp Pct

Final map output

The final output this time was a map of the Low Response Score, which is the estimated non-response.

##------------------------------------------------------------------------------
## TMAP tract map
##------------------------------------------------------------------------------
library(sf)
## Linking to GEOS 3.7.1, GDAL 2.2.3, PROJ 4.9.3
library(tmap)

shp <- shp_tracts_2020
shp@data <- cbind(shp@data,
                  pdb[i = match(shp@data$TRACT, 
                                TRACT_2020),
                      j = list(Low_Response_Score,
                               Tot_Population_CEN_2010,
                               Tot_Housing_Units_CEN_2010,
                               NH_Blk_alone_ACS_13_17,
                               Hispanic_ACS_13_17,
                               ENG_VW_ACS_13_17)],
                  resp_current[i = match(shp@data$TRACT, 
                                         TRACT), 
                               j = list(resp = CRRALL/100)])
## convert shape file to tmap format
shp_sf <- st_as_sf(shp, crs = 4326)

popups <- c("NAME", "Low_Response_Score", "Tot_Population_CEN_2010", 
            "Tot_Housing_Units_CEN_2010", "NH_Blk_alone_ACS_13_17",
            "Hispanic_ACS_13_17", "ENG_VW_ACS_13_17")
breaks <- seq(0, 45, by = 5)
map1 <- tm_shape(shp_sf[sf::st_is_valid(shp_sf), ], 
                 is.master = TRUE) + 
  tm_polygons(col = "Low_Response_Score", 
              palette = "YlOrRd", 
              style="cont", 
              alpha = .7, 
              title = "Low Response Score", 
              breaks = breaks, 
              popup.var = popups, 
              group = "Relevant Community Area Census Tracts") +
  tm_layout(title = "Heatmap of Low Response Scores") +
  tm_view(view.legend.position = c("left", "bottom"))  +
  tm_basemap(server = "OpenMapSurfer.Roads") +
  tm_shape(chi_community_areas) +
  tm_borders(col = "blue", lwd=3, group = paste0("Community Areas", " Outline"))
tmap_mode(mode = c("plot", "view")[2])
## tmap mode set to interactive viewing
map1

Overall shape of low response score.

hist(shp@data$Low_Response_Score)